/*
* Copyright (c) 2009 Andrejs Jermakovics.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Andrejs Jermakovics - initial implementation
*/
package it.unibz.instasearch.indexing;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.eclipse.core.resources.IStorage;
import org.eclipse.core.runtime.CoreException;
import org.eclipse.core.runtime.IPath;
/**
* Indexes documents of type IStorage
*/
/**
 * Indexes documents of type {@link IStorage} into a Lucene index.
 *
 * Not thread-safe: callers are expected to coordinate concurrent access
 * to the underlying {@link Directory} themselves.
 */
public class StorageIndexer
{
    /** Maximum number of terms indexed per one document */
    private static final int MAX_TERMS_PER_DOC = 200000;

    /** Placeholder stored when a field has no value (e.g. a file without an extension) */
    public static final String NO_VALUE = "<none>";

    /** Minimum length of a word for it to be indexed */
    public static final int MIN_WORD_LENGTH = 1;

    protected static final FileAnalyzer fileAnalyzer = new FileAnalyzer(MIN_WORD_LENGTH);

    private IndexChangeListener changeListener = new NullIndexChangeListener();

    private static final Similarity similarity = new LengthNormSimilarity();

    /** Number of times retrying operations are attempted before giving up */
    private static final int MAX_RETRY_ATTEMPTS = 10;

    private Directory indexDir;

    /**
     * Creates the indexer, releasing any stale write lock left over
     * from a previous (possibly crashed) session.
     *
     * @throws IOException if the index directory cannot be accessed
     */
    public StorageIndexer() throws IOException
    {
        checkLock();
    }

    /**
     * Unlocks the index if a write lock is present.
     * The index should not be locked at startup; a leftover lock means a
     * previous session did not shut down cleanly.
     */
    private void checkLock() throws IOException
    {
        Directory indexDir = getIndexDir();

        if( IndexWriter.isLocked(indexDir) ) // should not be locked at startup, unlock
            IndexWriter.unlock(indexDir);
    }

    /**
     * Returns the index directory, creating an in-memory one on first use.
     * Subclasses may override to supply a persistent directory.
     *
     * @return the Lucene {@link Directory} holding the index
     * @throws IOException if the directory cannot be created
     */
    public Directory getIndexDir() throws IOException
    {
        if( indexDir == null ) indexDir = new RAMDirectory();

        return indexDir;
    }

    /**
     * Creates a writer over the index directory, configured with the
     * project's analyzer and similarity.
     *
     * @param create whether to create a new index (erasing any existing one)
     *               or to append to the existing index
     * @return a configured {@link IndexWriter}; the caller must close it
     * @throws IOException if the writer cannot be opened
     */
    public IndexWriter createIndexWriter(boolean create) throws IOException
    {
        IndexWriter indexWriter = new IndexWriter(getIndexDir(), fileAnalyzer, create, MaxFieldLength.UNLIMITED);

        indexWriter.setMergeFactor(2); // use less resources (although slower)
        indexWriter.setSimilarity(similarity);
        indexWriter.setMaxFieldLength(MAX_TERMS_PER_DOC);

        return indexWriter;
    }

    /**
     * @return whether an index exists in the index directory
     * @throws IOException if the directory cannot be accessed
     */
    public boolean isIndexed() throws IOException
    {
        return IndexReader.indexExists(getIndexDir());
    }

    /**
     * Check if the index can be read.
     *
     * @return whether the index is readable
     */
    public boolean isReadable()
    {
        try {
            IndexReader reader = IndexReader.open(getIndexDir(), true);
            reader.close();
        } catch (IOException readingException) {
            return false; // unreadable or missing index
        }
        return true;
    }

    /**
     * Delete the whole index, retrying on failure.
     * Occasionally index files are temporarily locked by other processes,
     * so the deletion is retried via {@link #runRetryingRunnable}.
     *
     * @throws Exception if deletion still fails after all retry attempts
     */
    public void deleteIndex() throws Exception
    {
        RetryingRunnable runnable = new RetryingRunnable()
        {
            public void run() throws Exception
            {
                // open for writing and close (make empty)
                IndexWriter w = createIndexWriter(true);
                try
                {
                    w.deleteAll();
                    w.commit();
                }
                finally
                {
                    w.close(true);
                }

                Directory dir = getIndexDir();
                for(String file: dir.listAll())
                {
                    if( dir.fileExists(file) ) // still exists
                    {
                        dir.sync(file);
                        dir.deleteFile(file);
                    }
                }
                dir.close();
            }

            public boolean handleException(Throwable e)
            {
                return true; // always retry until attempts run out
            }
        };

        changeListener.onIndexReset(); // close searcher because index is deleted
        runRetryingRunnable(runnable); // delete index with retry
    }

    /**
     * Optimizes the index (merges segments) if one exists.
     * Notifies the change listener on success.
     *
     * @throws Exception if optimization fails
     */
    public void optimizeIndex() throws Exception
    {
        if( ! isIndexed() )
            return;

        IndexWriter w = createIndexWriter(false);
        try
        {
            w.optimize();
        }
        finally
        {
            w.close(); // always release the writer (and the index write lock)
        }

        changeListener.onIndexUpdate();
    }

    /**
     * @param changeListener the changeListener to set
     */
    public void setIndexChangeListener(IndexChangeListener changeListener)
    {
        this.changeListener = changeListener;
    }

    /**
     * @return the changeListener
     */
    protected IndexChangeListener getIndexChangeListener()
    {
        return changeListener;
    }

    /**
     * Indexes the contents and metadata of one storage element as a single
     * Lucene document.
     *
     * @param indexWriter writer to add the document to
     * @param storage the file-like element to index
     * @param projectName name of the project containing the file
     * @param modificationStamp modification time stamp of the file
     * @param jar path to jar file containing this file, or null
     * @throws IOException if the contents cannot be read or indexed
     *         (a {@link CoreException} from {@link IStorage#getContents()}
     *         is wrapped in an {@link IOException} to keep the signature simple)
     */
    public void indexStorage(IndexWriter indexWriter, IStorage storage, String projectName,
            long modificationStamp, String jar) throws IOException
    {
        InputStream contents;
        try
        {
            contents = storage.getContents();
        }
        catch (Exception e)
        {
            throw new IOException(e);
        }

        // NOTE(review): uses the platform default charset — presumably intentional
        // for workspace files, but confirm for archive contents
        BufferedReader isReader = new BufferedReader(new InputStreamReader(contents));
        try
        {
            IPath fullPath = storage.getFullPath();
            String ext = fullPath.getFileExtension();
            if( ext == null ) ext = NO_VALUE;

            Document doc = new Document();
            doc.add(createLuceneField(Field.CONTENTS, isReader));
            doc.add(createLuceneField(Field.FILE, fullPath.toString()));
            doc.add(createLuceneField(Field.PROJ, projectName));
            doc.add(createLuceneField(Field.NAME, fullPath.lastSegment()));
            doc.add(createLuceneField(Field.EXT, ext.toLowerCase(Locale.ENGLISH)));
            doc.add(createLuceneField(Field.MODIFIED, Long.toString(modificationStamp)));
            doc.add(createLuceneField(Field.JAR, (jar==null)?NO_VALUE:jar));

            indexWriter.addDocument(doc);
        }
        finally
        {
            // no-op when Lucene has already consumed and closed the reader during
            // addDocument; prevents a stream leak if an exception occurred earlier
            isReader.close();
        }
    }

    /**
     * Runs the given runnable, retrying up to {@link #MAX_RETRY_ATTEMPTS} times.
     * Between attempts it sleeps progressively longer (i * 1 second) to give
     * other processes time to release file locks. If all attempts fail, the
     * last exception is re-thrown.
     *
     * @param runnable the operation to run with retries
     * @throws Exception the last exception raised by the runnable, if it
     *         never completed successfully
     */
    private static void runRetryingRunnable(RetryingRunnable runnable) throws Exception
    {
        Throwable lastException = null;

        for(int i = 1; i <= MAX_RETRY_ATTEMPTS; i++)
        {
            try
            {
                runnable.run();
                lastException = null;
                break; // success
            }
            catch(Throwable e) // exception during run occurred
            {
                lastException = e;

                if( ! runnable.handleException(e) )
                    break; // handler asked to stop retrying; re-throw below
            }

            try {
                Thread.sleep( i*1000 ); // wait a bit longer each time for files to be freed
            } catch(Exception e) {
                break; // interrupted: stop retrying
            }
        }

        if( lastException != null )
        {
            if( lastException instanceof Exception )
                throw (Exception)lastException;
            else
                throw new Exception(lastException); // wrap Errors and other Throwables
        }
    }

    /**
     * Makes several attempts to index storage.
     * Occasionally the index files get locked (by other processes) and are temporarily not writable.
     *
     * @param indexWriter writer to add the document to
     * @param storage the file-like element to index
     * @param projectName name of the project containing the file
     * @param modificationStamp modification time stamp of the file
     * @param jar path to jar file containing this file, or null
     * @throws Exception if indexing still fails after all retry attempts
     */
    protected void indexStorageWithRetry(final IndexWriter indexWriter, final IStorage storage,
            final String projectName, final long modificationStamp, final String jar) throws Exception
    {
        RetryingRunnable runnable = new RetryingRunnable()
        {
            public void run() throws Exception
            {
                indexStorage(indexWriter, storage, projectName, modificationStamp, jar);
            }

            public boolean handleException(Throwable e)
            {
                if( e instanceof OutOfMemoryError )
                {
                    // first reduce the per-document term limit to use less memory;
                    // if it is already at the default, give up
                    if( indexWriter.getMaxFieldLength() > IndexWriter.DEFAULT_MAX_FIELD_LENGTH )
                        indexWriter.setMaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH);
                    else
                        return false;
                }
                else if( e instanceof IOException)
                {
                    changeListener.onIndexReset(); // close searcher
                }

                return true; // keep retrying
            }
        };

        runRetryingRunnable(runnable);
    }

    /**
     * An operation that can be re-run after a failure.
     */
    public interface RetryingRunnable
    {
        public void run() throws Exception;

        /**
         * If exception occurs during run()
         *
         * @param e the exception that was raised
         * @return true if should run again, false if stop and re-throw exception
         */
        public boolean handleException(Throwable e);
    }

    /**
     * Removes all documents for the given storage element from the index.
     *
     * @param storage the element whose documents should be deleted
     * @throws Exception if the index cannot be opened or modified
     */
    public void deleteStorage(IStorage storage) throws Exception
    {
        IndexReader reader = IndexReader.open(getIndexDir(), false);
        try
        {
            String filePath = storage.getFullPath().toString();
            Term term = Field.FILE.createTerm(filePath);
            reader.deleteDocuments(term);
        }
        finally
        {
            reader.close(); // always release the reader (and its write lock)
        }
    }

    /** Creates a stored, not-analyzed field holding the given string value. */
    private static org.apache.lucene.document.Field createLuceneField(Field fieldName, String value)
    {
        return new org.apache.lucene.document.Field(fieldName.toString(), value,
                Store.YES, org.apache.lucene.document.Field.Index.NOT_ANALYZED);
    }

    /** Creates a tokenized, term-vector-enabled field reading from the given reader. */
    private static org.apache.lucene.document.Field createLuceneField(Field fieldName, Reader reader)
    {
        return new org.apache.lucene.document.Field(fieldName.toString(),
                reader, TermVector.YES);
    }

    /**
     * Extracts terms from text.
     *
     * @param text the text to tokenize with the file analyzer
     * @return a map of lower-cased terms to their character offsets in text
     * @throws IOException if tokenization fails
     */
    public static Map<String, List<Integer>> extractTextTerms(String text) throws IOException
    {
        Map<String, List<Integer>> terms = new HashMap<String, List<Integer>>();

        TokenStream tokenStream = fileAnalyzer.tokenStream(Field.CONTENTS.toString(), new StringReader(text));
        try
        {
            TermAttribute termAtt = (TermAttribute) tokenStream.addAttribute(TermAttribute.class);
            OffsetAttribute offsetAtt = (OffsetAttribute) tokenStream.addAttribute(OffsetAttribute.class);

            while(tokenStream.incrementToken())
            {
                String termText = termAtt.term().toLowerCase(Locale.ENGLISH);
                int offset = offsetAtt.startOffset();

                List<Integer> offsets = terms.get(termText);
                if( offsets == null ) {
                    offsets = new LinkedList<Integer>();
                    terms.put(termText, offsets);
                }
                offsets.add(offset);
            }
        }
        finally
        {
            tokenStream.close(); // release the stream even if tokenization fails
        }

        return terms;
    }

    /**
     * Listener that gets called when index has changed
     */
    public interface IndexChangeListener
    {
        /** Index was updated with files or files were removed */
        public void onIndexUpdate();

        /** Index was reset - created or deleted */
        public void onIndexReset();
    }

    /** Empty implementation to avoid null checks (Null Object pattern) */
    private static class NullIndexChangeListener implements IndexChangeListener
    {
        public void onIndexUpdate() {}
        public void onIndexReset() {}
    }
}